#include <transpose_ops.h>

#include <cstddef>
/// @brief Writes the transpose of a matrix into a second buffer.
///
/// The input is indexed as a row-major `rows` x `cols` matrix
/// (`d_in[r * cols + c]`); the output is written as a row-major
/// `cols` x `rows` matrix (`d_out[c * rows + r]`). Both pointers are
/// dereferenced directly by this function.
///
/// @tparam T     Element type; must be copy-assignable.
/// @param d_out  Destination buffer with room for `rows * cols` elements.
/// @param d_in   Source buffer holding `rows * cols` elements. It is only
///               read. It must not overlap `d_out`, otherwise elements can
///               be overwritten before they are read.
/// @param rows   Number of rows in the input matrix.
/// @param cols   Number of columns in the input matrix.
///
/// If either dimension is zero, nothing is read or written.
template <typename T>
void transpose_cpu(T *d_out, T *d_in, const unsigned int rows,
                   const unsigned int cols) {
    // Widen the dimensions once so that the loop counters and the flattened
    // offsets are computed in std::size_t. Doing the arithmetic in
    // (unsigned) int would wrap as soon as rows * cols exceeds 32 bits, and
    // a signed int counter could never reach bounds above INT_MAX.
    const std::size_t n_rows = rows;
    const std::size_t n_cols = cols;

    for (std::size_t c = 0; c < n_cols; ++c) {
        for (std::size_t r = 0; r < n_rows; ++r) {
            // Element (r, c) of the input becomes element (c, r) of the output.
            d_out[c * n_rows + r] = d_in[r * n_cols + c];
        }
    }
}
// Explicit instantiations of transpose_cpu for the element types int and
// float.
template void transpose_cpu<int>(int *d_out, int *d_in, unsigned int rows,
                                 unsigned int cols);
template void transpose_cpu<float>(float *d_out, float *d_in,
                                   unsigned int rows, unsigned int cols);
